{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "execution": {
     "iopub.execute_input": "2022-07-08T13:06:01.928739Z",
     "iopub.status.busy": "2022-07-08T13:06:01.928574Z",
     "iopub.status.idle": "2022-07-08T13:06:02.166957Z",
     "shell.execute_reply": "2022-07-08T13:06:02.166426Z",
     "shell.execute_reply.started": "2022-07-08T13:06:01.928723Z"
    },
    "tags": []
   },
   "outputs": [],
   "source": [
    "import pandas as pd"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {
    "execution": {
     "iopub.execute_input": "2022-07-08T13:06:02.167787Z",
     "iopub.status.busy": "2022-07-08T13:06:02.167621Z",
     "iopub.status.idle": "2022-07-08T13:06:02.434085Z",
     "shell.execute_reply": "2022-07-08T13:06:02.433175Z",
     "shell.execute_reply.started": "2022-07-08T13:06:02.167772Z"
    },
    "tags": []
   },
   "outputs": [],
   "source": [
    "# Read the labelled (train) and unlabelled (test) splits of the competition data.\n",
    "train_df, test_df = (\n",
    "    pd.read_csv(csv_path, sep=',')\n",
    "    for csv_path in (\n",
    "        './基于论文摘要的文本分类与查询性问答公开数据/train.csv',\n",
    "        './基于论文摘要的文本分类与查询性问答公开数据/test.csv',\n",
    "    )\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {
    "execution": {
     "iopub.execute_input": "2022-07-08T13:06:02.434942Z",
     "iopub.status.busy": "2022-07-08T13:06:02.434774Z",
     "iopub.status.idle": "2022-07-08T13:06:02.447495Z",
     "shell.execute_reply": "2022-07-08T13:06:02.447024Z",
     "shell.execute_reply.started": "2022-07-08T13:06:02.434927Z"
    },
    "tags": []
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Title</th>\n",
       "      <th>Authors</th>\n",
       "      <th>Citation</th>\n",
       "      <th>Abstract</th>\n",
       "      <th>DOI</th>\n",
       "      <th>Topic(Label)</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>\\n  \\n    \\n    \\n    \\n    \\n      \\n  The Va...</td>\n",
       "      <td>['Seyhmus Tunc', 'Suleyman Cemil Oglak', 'Fatm...</td>\n",
       "      <td>2022 Jun;32(6):722-727.</td>\n",
       "      <td>\\n\\n\\n          Objective:\\n        \\n      \\n...</td>\n",
       "      <td>\\n        doi: 10.29271/jcpsp.2022.06.722.\\n  ...</td>\n",
       "      <td>Abdominal+Fat</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>\\n  \\n    \\n    \\n    \\n    \\n      \\n  Metfor...</td>\n",
       "      <td>['Katrin Schmitz', 'Eva-Maria Turnwald', 'Tobi...</td>\n",
       "      <td>2022 May 30;14(11):2288.</td>\n",
       "      <td>\\n\\n      \\n      With the gaining prevalence ...</td>\n",
       "      <td>\\n        doi: 10.3390/nu14112288.\\n</td>\n",
       "      <td>Abdominal+Fat</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>\\n  \\n    \\n    \\n    \\n    \\n      \\n  New In...</td>\n",
       "      <td>['Jorge Valencia-Ortega', 'Rebeca GonzÃ¡lez-Re...</td>\n",
       "      <td>2022 Jun 3;23(11):6279.</td>\n",
       "      <td>\\n\\n      \\n      Gestational diabetes mellitu...</td>\n",
       "      <td>\\n        doi: 10.3390/ijms23116279.\\n</td>\n",
       "      <td>Abdominal+Fat</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>\\n  \\n    \\n    \\n    \\n    \\n      \\n  Î±-Lin...</td>\n",
       "      <td>['Victoria Aiassa', 'MarÃ­a Del Rosario Ferrei...</td>\n",
       "      <td>2022 Jun;156:111164.</td>\n",
       "      <td>\\n\\n      \\n      Given obesity and its associ...</td>\n",
       "      <td>\\n        doi: 10.1016/j.foodres.2022.111164.\\...</td>\n",
       "      <td>Abdominal+Fat</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>\\n  \\n    \\n    \\n    \\n    \\n      \\n  Perire...</td>\n",
       "      <td>['Ming Jiang', 'Menghuan Li', 'Cuiying Liu', '...</td>\n",
       "      <td>2022 May 6;13:865009.</td>\n",
       "      <td>\\n\\n\\n          Background:\\n        \\n      \\...</td>\n",
       "      <td>\\n        doi: 10.3389/fendo.2022.865009.\\n</td>\n",
       "      <td>Abdominal+Fat</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                                               Title  \\\n",
       "0  \\n  \\n    \\n    \\n    \\n    \\n      \\n  The Va...   \n",
       "1  \\n  \\n    \\n    \\n    \\n    \\n      \\n  Metfor...   \n",
       "2  \\n  \\n    \\n    \\n    \\n    \\n      \\n  New In...   \n",
       "3  \\n  \\n    \\n    \\n    \\n    \\n      \\n  Î±-Lin...   \n",
       "4  \\n  \\n    \\n    \\n    \\n    \\n      \\n  Perire...   \n",
       "\n",
       "                                             Authors  \\\n",
       "0  ['Seyhmus Tunc', 'Suleyman Cemil Oglak', 'Fatm...   \n",
       "1  ['Katrin Schmitz', 'Eva-Maria Turnwald', 'Tobi...   \n",
       "2  ['Jorge Valencia-Ortega', 'Rebeca GonzÃ¡lez-Re...   \n",
       "3  ['Victoria Aiassa', 'MarÃ­a Del Rosario Ferrei...   \n",
       "4  ['Ming Jiang', 'Menghuan Li', 'Cuiying Liu', '...   \n",
       "\n",
       "                   Citation  \\\n",
       "0   2022 Jun;32(6):722-727.   \n",
       "1  2022 May 30;14(11):2288.   \n",
       "2   2022 Jun 3;23(11):6279.   \n",
       "3      2022 Jun;156:111164.   \n",
       "4     2022 May 6;13:865009.   \n",
       "\n",
       "                                            Abstract  \\\n",
       "0  \\n\\n\\n          Objective:\\n        \\n      \\n...   \n",
       "1  \\n\\n      \\n      With the gaining prevalence ...   \n",
       "2  \\n\\n      \\n      Gestational diabetes mellitu...   \n",
       "3  \\n\\n      \\n      Given obesity and its associ...   \n",
       "4  \\n\\n\\n          Background:\\n        \\n      \\...   \n",
       "\n",
       "                                                 DOI   Topic(Label)  \n",
       "0  \\n        doi: 10.29271/jcpsp.2022.06.722.\\n  ...  Abdominal+Fat  \n",
       "1         \\n        doi: 10.3390/nu14112288.\\n        Abdominal+Fat  \n",
       "2       \\n        doi: 10.3390/ijms23116279.\\n        Abdominal+Fat  \n",
       "3  \\n        doi: 10.1016/j.foodres.2022.111164.\\...  Abdominal+Fat  \n",
       "4  \\n        doi: 10.3389/fendo.2022.865009.\\n        Abdominal+Fat  "
      ]
     },
     "execution_count": 3,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Peek at the first five training rows. Title, Abstract and DOI still carry the\n",
    "# newline/space padding of the raw file, and Topic(Label) holds the raw topic name.\n",
    "train_df.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {
    "execution": {
     "iopub.execute_input": "2022-07-08T13:06:02.448374Z",
     "iopub.status.busy": "2022-07-08T13:06:02.448214Z",
     "iopub.status.idle": "2022-07-08T13:06:02.521551Z",
     "shell.execute_reply": "2022-07-08T13:06:02.521121Z",
     "shell.execute_reply.started": "2022-07-08T13:06:02.448359Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Title</th>\n",
       "      <th>Authors</th>\n",
       "      <th>Citation</th>\n",
       "      <th>Abstract</th>\n",
       "      <th>DOI</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>\\n  \\n    \\n    \\n    \\n    \\n      \\n  Gut mi...</td>\n",
       "      <td>['Xiaomin Su', 'Minying Zhang', 'Houbao Qi', '...</td>\n",
       "      <td>2022 Jan 24;10(1):13.</td>\n",
       "      <td>\\n\\n\\n          Background:\\n        \\n      \\...</td>\n",
       "      <td>\\n        doi: 10.1186/s40168-021-01205-8.\\n  ...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>\\n  \\n    \\n    \\n    \\n    \\n      \\n  [A spa...</td>\n",
       "      <td>['Naigong Yu', 'Yishen Liao', 'Naigong Yu', 'Y...</td>\n",
       "      <td>2022 Apr 25;39(2):217-227.</td>\n",
       "      <td>\\n\\n      \\n      Physiological studies reveal...</td>\n",
       "      <td>\\n        doi: 10.7507/1001-5515.202109051.\\n ...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>\\n  \\n    \\n    \\n    \\n    \\n      \\n  A muci...</td>\n",
       "      <td>['Ju-Hyung Lee', 'Soo-Jeong Kwon', 'Ji-Yoon Ha...</td>\n",
       "      <td>2022 Feb;60(2):215-223.</td>\n",
       "      <td>\\n\\n      \\n      The mammalian intestinal tra...</td>\n",
       "      <td>\\n        doi: 10.1007/s12275-022-1649-3.\\n</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>\\n  \\n    \\n    \\n    \\n    \\n      \\n  Progre...</td>\n",
       "      <td>['Michal Å tros', 'Eva VolfovÃ¡ PolanskÃ¡', 'T...</td>\n",
       "      <td>2022 Apr 5;12(4):544.</td>\n",
       "      <td>\\n\\n      \\n      Extracellular HMGB1 protein ...</td>\n",
       "      <td>\\n        doi: 10.3390/biom12040544.\\n</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>\\n  \\n    \\n    \\n    \\n    \\n      \\n  Microb...</td>\n",
       "      <td>['Jing Ouyang', 'Silvere D Zaongo', 'Xue Zhang...</td>\n",
       "      <td>2022 Jan 6;12:755890.</td>\n",
       "      <td>\\n\\n      \\n      Hepatitis B virus (HBV) co-i...</td>\n",
       "      <td>\\n        doi: 10.3389/fimmu.2021.755890.\\n</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                                               Title  \\\n",
       "0  \\n  \\n    \\n    \\n    \\n    \\n      \\n  Gut mi...   \n",
       "1  \\n  \\n    \\n    \\n    \\n    \\n      \\n  [A spa...   \n",
       "2  \\n  \\n    \\n    \\n    \\n    \\n      \\n  A muci...   \n",
       "3  \\n  \\n    \\n    \\n    \\n    \\n      \\n  Progre...   \n",
       "4  \\n  \\n    \\n    \\n    \\n    \\n      \\n  Microb...   \n",
       "\n",
       "                                             Authors  \\\n",
       "0  ['Xiaomin Su', 'Minying Zhang', 'Houbao Qi', '...   \n",
       "1  ['Naigong Yu', 'Yishen Liao', 'Naigong Yu', 'Y...   \n",
       "2  ['Ju-Hyung Lee', 'Soo-Jeong Kwon', 'Ji-Yoon Ha...   \n",
       "3  ['Michal Å tros', 'Eva VolfovÃ¡ PolanskÃ¡', 'T...   \n",
       "4  ['Jing Ouyang', 'Silvere D Zaongo', 'Xue Zhang...   \n",
       "\n",
       "                     Citation  \\\n",
       "0       2022 Jan 24;10(1):13.   \n",
       "1  2022 Apr 25;39(2):217-227.   \n",
       "2     2022 Feb;60(2):215-223.   \n",
       "3       2022 Apr 5;12(4):544.   \n",
       "4       2022 Jan 6;12:755890.   \n",
       "\n",
       "                                            Abstract  \\\n",
       "0  \\n\\n\\n          Background:\\n        \\n      \\...   \n",
       "1  \\n\\n      \\n      Physiological studies reveal...   \n",
       "2  \\n\\n      \\n      The mammalian intestinal tra...   \n",
       "3  \\n\\n      \\n      Extracellular HMGB1 protein ...   \n",
       "4  \\n\\n      \\n      Hepatitis B virus (HBV) co-i...   \n",
       "\n",
       "                                                 DOI  \n",
       "0  \\n        doi: 10.1186/s40168-021-01205-8.\\n  ...  \n",
       "1  \\n        doi: 10.7507/1001-5515.202109051.\\n ...  \n",
       "2  \\n        doi: 10.1007/s12275-022-1649-3.\\n        \n",
       "3       \\n        doi: 10.3390/biom12040544.\\n        \n",
       "4  \\n        doi: 10.3389/fimmu.2021.755890.\\n        "
      ]
     },
     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Peek at the first five test rows: same columns as the training frame except\n",
    "# Topic(Label), which is the column the model has to predict.\n",
    "test_df.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {
    "execution": {
     "iopub.execute_input": "2022-07-08T13:06:02.522871Z",
     "iopub.status.busy": "2022-07-08T13:06:02.522706Z",
     "iopub.status.idle": "2022-07-08T13:06:02.592016Z",
     "shell.execute_reply": "2022-07-08T13:06:02.591211Z",
     "shell.execute_reply.started": "2022-07-08T13:06:02.522856Z"
    }
   },
   "outputs": [],
   "source": [
    "# Encode the topic names as integer class ids; `lbl` keeps the id -> name lookup that\n",
    "# is needed to decode the predictions before the submission is written.\n",
    "# Guarded on the dtype so that re-running this cell does not factorize the ids\n",
    "# themselves, which would overwrite `lbl` with plain integers and lose the names.\n",
    "if not pd.api.types.is_integer_dtype(train_df['Topic(Label)']):\n",
    "    train_df['Topic(Label)'], lbl = pd.factorize(train_df['Topic(Label)'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {
    "execution": {
     "iopub.execute_input": "2022-07-08T13:06:02.593238Z",
     "iopub.status.busy": "2022-07-08T13:06:02.593034Z",
     "iopub.status.idle": "2022-07-08T13:06:02.793459Z",
     "shell.execute_reply": "2022-07-08T13:06:02.792930Z",
     "shell.execute_reply.started": "2022-07-08T13:06:02.593220Z"
    },
    "tags": []
   },
   "outputs": [],
   "source": [
    "def add_text_column(df):\n",
    "    \"\"\"Strip Title/Abstract and add a lower-cased 'text' column (Title + ' ' + Abstract).\n",
    "\n",
    "    Missing or non-string values in either column are turned into strings first\n",
    "    (NaN becomes ''), so a row without a title no longer raises AttributeError.\n",
    "    The frame is modified in place and also returned.\n",
    "    \"\"\"\n",
    "    for col in ('Title', 'Abstract'):\n",
    "        df[col] = df[col].fillna('').astype(str).str.strip()\n",
    "    df['text'] = (df['Title'] + ' ' + df['Abstract']).str.lower()\n",
    "    return df\n",
    "\n",
    "\n",
    "# Same preprocessing for both splits so that train and test features stay comparable.\n",
    "train_df = add_text_column(train_df)\n",
    "test_df = add_text_column(test_df)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {
    "execution": {
     "iopub.execute_input": "2022-07-08T13:06:02.794456Z",
     "iopub.status.busy": "2022-07-08T13:06:02.794183Z",
     "iopub.status.idle": "2022-07-08T13:06:03.268439Z",
     "shell.execute_reply": "2022-07-08T13:06:03.267860Z",
     "shell.execute_reply.started": "2022-07-08T13:06:02.794439Z"
    },
    "tags": []
   },
   "outputs": [],
   "source": [
    "from sklearn.feature_extraction.text import TfidfVectorizer\n",
    "from sklearn.linear_model import SGDClassifier\n",
    "from sklearn.model_selection import cross_val_score\n",
    "\n",
    "# Upper bound on the TF-IDF vocabulary size (number of feature columns).\n",
    "MAX_FEATURES = 2500\n",
    "\n",
    "# Vectoriser shared by the training and test text.\n",
    "tfidf = TfidfVectorizer(max_features=MAX_FEATURES)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {
    "execution": {
     "iopub.execute_input": "2022-07-08T13:06:03.269338Z",
     "iopub.status.busy": "2022-07-08T13:06:03.269165Z",
     "iopub.status.idle": "2022-07-08T13:06:07.962686Z",
     "shell.execute_reply": "2022-07-08T13:06:07.962198Z",
     "shell.execute_reply.started": "2022-07-08T13:06:03.269322Z"
    },
    "tags": []
   },
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/home/lyz/.local/lib/python3.6/site-packages/sklearn/model_selection/_split.py:668: UserWarning: The least populated class in y has only 1 members, which is less than n_splits=5.\n",
      "  % (min_groups, self.n_splits)), UserWarning)\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "array([0.88821569, 0.88245986, 0.88882157, 0.89215389, 0.8769697 ])"
      ]
     },
     "execution_count": 8,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "from sklearn.base import clone\n",
    "from sklearn.pipeline import make_pipeline\n",
    "\n",
    "# Document-term matrix fitted on every training row; the next cell trains the\n",
    "# final model on it.\n",
    "train_tfidf = tfidf.fit_transform(train_df['text'])\n",
    "\n",
    "# For cross-validation an unfitted copy of the vectoriser is re-fitted inside each\n",
    "# fold (pipeline on the raw text), so vocabulary and IDF weights never see the\n",
    "# held-out rows. random_state pins SGD's shuffling, making the scores reproducible.\n",
    "# NOTE: some classes have a single sample, so the stratified 5-fold split warns.\n",
    "clf = SGDClassifier(random_state=42)\n",
    "cv_model = make_pipeline(clone(tfidf), clf)\n",
    "cross_val_score(cv_model, train_df['text'], train_df['Topic(Label)'], cv=5)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {
    "execution": {
     "iopub.execute_input": "2022-07-08T13:06:07.963693Z",
     "iopub.status.busy": "2022-07-08T13:06:07.963528Z",
     "iopub.status.idle": "2022-07-08T13:06:08.896253Z",
     "shell.execute_reply": "2022-07-08T13:06:08.895590Z",
     "shell.execute_reply.started": "2022-07-08T13:06:07.963678Z"
    },
    "tags": []
   },
   "outputs": [],
   "source": [
    "# Vectorise the test text with the vocabulary and IDF weights learned on the training set.\n",
    "test_tfidf = tfidf.transform(test_df['text'])\n",
    "\n",
    "# Fresh classifier trained on all training rows. The fixed random_state pins SGD's\n",
    "# data shuffling so that the submitted predictions are the same on every run.\n",
    "clf = SGDClassifier(random_state=42)\n",
    "clf.fit(train_tfidf, train_df['Topic(Label)'])\n",
    "\n",
    "# Predictions are integer class ids at this point; they are decoded in the next cell.\n",
    "test_df['Topic(Label)'] = clf.predict(test_tfidf)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {
    "execution": {
     "iopub.execute_input": "2022-07-08T13:06:08.897152Z",
     "iopub.status.busy": "2022-07-08T13:06:08.896988Z",
     "iopub.status.idle": "2022-07-08T13:06:08.905681Z",
     "shell.execute_reply": "2022-07-08T13:06:08.905282Z",
     "shell.execute_reply.started": "2022-07-08T13:06:08.897137Z"
    },
    "tags": []
   },
   "outputs": [],
   "source": [
    "# Decode the predicted class ids back to topic names through the factorize lookup `lbl`.\n",
    "# Only done while the column still holds integer ids, so re-running this cell does\n",
    "# not try to index `lbl` with names that were already decoded.\n",
    "predicted = test_df['Topic(Label)']\n",
    "if pd.api.types.is_integer_dtype(predicted):\n",
    "    test_df['Topic(Label)'] = lbl.take(predicted.values)\n",
    "\n",
    "# One-column submission file, written without the row index.\n",
    "test_df[['Topic(Label)']].to_csv('submit.csv', index=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "tags": []
   },
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3.6 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.9"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}
